# Data ----
# Download GeoJSON of New Jersey county boundaries from the NJ open-data
# portal. The file is ~8.4 MB and is written to the working directory.
url <- "https://opendata.arcgis.com/datasets/5f45e1ece6e14ef5866974a7b57d3b95_1.geojson"
file <- "NJ_counties.geojson"
download.file(url, destfile = file)
# Console output from download.file():
#   trying URL 'https://opendata.arcgis.com/datasets/5f45e1ece6e14ef5866974a7b57d3b95_1.geojson'
#   Content type 'application/json' length unknown
#   downloaded 8.4 MB
rm(url)
# Convert the downloaded GeoJSON file to an sf (simple features) object ----
# NJ_Counties carries one polygon per county plus the attribute columns
# cleaned up in the next step.
file <- "NJ_counties.geojson"
NJ_Counties <- geojson_sf(file)
rm(file)
# Clean data: keep only the columns of interest, renaming where needed ----
# select() is the idiomatic verb for keep-and-rename; transmute() with
# identity copies (CO = CO, ...) is needlessly verbose. On sf objects the
# geometry column is sticky and is retained either way.
NJ_Counties_Cleaned <-
  NJ_Counties %>%
  select(
    county = COUNTY,
    CO,
    pop = POP2010,
    popdensity = POPDEN2010,
    Shape_Length,
    Shape_Area,
    GNIS
  )
# Free memory before launching the Selenium browser session below
gc()
# Console output from gc():
#            used (Mb) gc trigger (Mb) max used (Mb)
#   Ncells  4676000 249.8  7330683 391.6  7330683 391.6
#   Vcells 16384116 125.1 34559173 263.7 34559173 263.7
# Start a Selenium server and a Firefox client on port 44454.
# browser takes a single string; wrapping it in c() is unnecessary.
driver <- rsDriver(browser = "firefox", port = 44454L)
# Console output from rsDriver() — driver downloads and Firefox session
# capabilities returned on connection:
#   checking Selenium Server versions:
#   BEGIN: PREDOWNLOAD
#   BEGIN: DOWNLOAD
#   BEGIN: POSTDOWNLOAD
#   checking chromedriver versions:
#   BEGIN: PREDOWNLOAD
#   BEGIN: DOWNLOAD
#   BEGIN: POSTDOWNLOAD
#   checking geckodriver versions:
#   BEGIN: PREDOWNLOAD
#   BEGIN: DOWNLOAD
#   BEGIN: POSTDOWNLOAD
#   checking phantomjs versions:
#   BEGIN: PREDOWNLOAD
#   BEGIN: DOWNLOAD
#   BEGIN: POSTDOWNLOAD
#   [1] "Connecting to remote server"
#   $acceptInsecureCerts
#   [1] FALSE
#   $browserName
#   [1] "firefox"
#   $browserVersion
#   [1] "84.0"
#   $`moz:accessibilityChecks`
#   [1] FALSE
#   $`moz:buildID`
#   [1] "20201211215739"
#   $`moz:geckodriverVersion`
#   [1] "0.28.0"
#   $`moz:headless`
#   [1] FALSE
#   $`moz:processID`
#   [1] 2128
#   $`moz:profile`
#   [1] "C:\\Users\\Devansh\\AppData\\Local\\Temp\\rust_mozprofilex2sKxq"
#   $`moz:shutdownTimeout`
#   [1] 60000
#   $`moz:useNonSpecCompliantPointerOrigin`
#   [1] FALSE
#   $`moz:webdriverClick`
#   [1] TRUE
#   $pageLoadStrategy
#   [1] "normal"
#   $platformName
#   [1] "windows"
#   $platformVersion
#   [1] "10.0"
#   $rotatable
#   [1] FALSE
#   $setWindowRect
#   [1] TRUE
#   $strictFileInteractability
#   [1] FALSE
#   $timeouts
#   $timeouts$implicit
#   [1] 0
#   $timeouts$pageLoad
#   [1] 300000
#   $timeouts$script
#   [1] 30000
#   $unhandledPromptBehavior
#   [1] "dismiss and notify"
#   $webdriver.remote.sessionid
#   [1] "79262589-c403-4b36-b334-63af6392461e"
#   $id
#   [1] "79262589-c403-4b36-b334-63af6392461e"
# Grab the client handle from the Selenium driver, open the hospital
# locations directory, and capture the rendered page HTML for scraping.
remote_driver <- driver$client
remote_driver$navigate("https://www.childrens-specialized.org/locations-directory/?")
page <- remote_driver$getPageSource()
# Retrieve name, address, and daily opening hours for each of the 15
# directory entries, building the Hosinfo data frame one row per location.
#
# Fixes over the original version:
#   * page source is parsed with read_html() once, not on every iteration
#   * the inner loop uses its own index `j` instead of shadowing the outer `i`
#   * hours are collected into a named list rather than assign()-ed into
#     the global environment
#   * rows are bound once at the end instead of rbind() inside the loop
xpath_prefix <- "/html/body/div[1]/div/div/div[2]/div/div[2]/div["
xpath_suffix <- "]/div/div[2]/article"

page_doc <- read_html(page[[1]])

rows <- vector("list", 15L)
for (i in seq_len(15L)) {
  xpath <- paste0(xpath_prefix, i, xpath_suffix)
  article <- html_nodes(page_doc, xpath = xpath)[[1]]

  # Location name sits in the article's <h2>; the address is in its first
  # <h3>, with newline + indentation collapsed to single spaces.
  name <- article %>%
    html_node("h2") %>%
    html_text()
  address <- article %>%
    html_node("h3") %>%
    html_text() %>%
    gsub(pattern = "\n *", replacement = " ", x = .)

  # Each of the 7 child <div>s carries a class like "Monday-Hours"; the
  # opening times for that day live in the div's <h3>.
  hours <- list()
  for (j in seq_len(7L)) {
    xpath_day <- paste0(xpath, "/div[", j, "]")
    day_node <- html_nodes(page_doc, xpath = xpath_day)
    day <- day_node %>%
      html_attr("class") %>%
      grep("-Hours", x = ., value = TRUE) %>%
      gsub("-Hours", "", x = .)
    hours[[day]] <- day_node %>%
      html_node("h3") %>%
      html_text()
  }

  rows[[i]] <- data.frame(
    name = name,
    address = address,
    Monday = hours[["Monday"]],
    Tuesday = hours[["Tuesday"]],
    Wednesday = hours[["Wednesday"]],
    Thursday = hours[["Thursday"]],
    Friday = hours[["Friday"]],
    Saturday = hours[["Saturday"]],
    Sunday = hours[["Sunday"]]
  )
}
Hosinfo <- do.call(rbind, rows)
# Write csv file. row.names = FALSE avoids a spurious "X" index column
# appearing when the file is read back with read.csv() below.
write.csv(Hosinfo, "Hospitals.csv", row.names = FALSE)
# Data Wrangling ----
# Re-read the scraped data, geocode each address, and save the result.
# (The unused `pattern` regex from the original draft has been removed.)
Hosinfo <- read.csv("Hospitals.csv")
Hosloc <-
  Hosinfo %>%
  select(name, address) %>%
  mutate_geocode(address) # Requires a Google Maps API key registered via ggmap
write.csv(Hosloc, "Hospitalsloc.csv", row.names = FALSE)